/*==============================================================================
CLEANING OF LONESTATISTIK

Data 	: 
Folder 	: 
Date	: 
Creator		: Jonas Cederlof	(JC)
Description 	: 	
Notes		: 	

LATEST UPDATE : 

==============================================================================*/

* Session setup: empty the data in memory, turn off output paging, close any
* log that is still open, start this script's log and load the A0 wage survey.
clear
set more off
capture log close _all
log using "../log/A1_clean_wagesurvey.log", replace
use "$datapath/A0_wagesurvey_00_19.dta"


{ // Rename and label variables
*===============================================================================
* Group rename: old names (left list) map position-wise to new names (right list)
rename (lopnr lopnr_peorgnr sector) (persid firmid sample_WSsector)

label variable sample_WSsector "1=kommun, 2=landsting, 3=statlig, 4=privat"

}
*

{ // Fix sekt variable
*===============================================================================
* Where sekt is empty, fill it in from the survey sector code:
*   1 -> "Pk", 2 -> "Lt", 3 -> "St". Rows in sector 4 are left untouched.
replace sekt = "Pk" if sample_WSsector == 1 & sekt == ""
replace sekt = "Lt" if sample_WSsector == 2 & sekt == ""
replace sekt = "St" if sample_WSsector == 3 & sekt == ""

* Cross-tabulate sekt against the survey sector, showing missing values too
tabulate sekt sample_WSsector, missing
* Note: The 70 Ar and 674 Tj observations in sector 3 stem from the duplicates
* handled in code lines 49-65 of A0_append_wagesurvey, where people work in both
* sector 3 and 4 for the same employer (some kind of measurement error).
}
*

// Fill in missing values of monthly wage (from other wage variables)
* Years/sectors in which the monthly wage (manl) is missing
tabulate year sample_WSsector if manl == .

* glon and grlon are the same variable, just named differently across sectors:
*   grlon is 2008-2018 for sector 1 and 2
*   glon  is 2014-2018 for sector 3 and 4
* Show the year/sector coverage of each before merging them.
foreach wagevar of varlist glon grlon {
	tabulate year sample_WSsector if `wagevar' != .
}

* Copy glon into grlon wherever only glon is observed, then drop glon
replace grlon = glon if glon != . & grlon == .
drop glon


*CPI adjust
* Attach the yearly CPI series and divide every wage component by it.
* keep(master match) discards CPI years that do not occur in the wage data
* (same effect as dropping _merge==2 after the merge).
merge m:1 year 	using "$datapath/cpi_jc.dta", keep(master match)
drop 	_merge

* Rows without a CPI value get all wage variables set to missing by the
* division below. Report how many there are instead of letting that pass
* silently; the data themselves are handled exactly as before.
qui count if missing(cpi)
local n_nocpi = r(N)
if `n_nocpi' > 0 {
	display as error "Warning: `n_nocpi' observation(s) have no CPI value; their wage variables become missing"
}

foreach var of varlist 	manl fastlon ovklon rorllon grlon rortup skift jourberers {
	qui replace `var' = `var'/cpi
}
drop cpi

* Save the cleaned, CPI-adjusted data (still one row per wage survey record)
save "$datapath/A1_clean_wagesurvey_00_19.dta",replace

{ // Generate white and blue collar indicators
*===============================================================================
* Note: Two separate datasets are written, each holding an indicator for the
*	type of "collar" a worker has. The first is specific to
*	persid/year/firmid, the second to persid/year.
*	encode numbers the sekt strings in alphabetical order, so (max) keeps
*	the alphabetically last sekt value within each group.

// Dataset 1: one row per persid/year/firmid
preserve
	encode sekt, generate(sector_firm)
	gcollapse (max) sector_firm, by(year persid firmid)

	* Rows without a firm identifier are not kept in the firm-specific file
	keep if firmid != .
	compress
	save "$datapath/workcat_firmspec.dta", replace
restore

// Dataset 2: one row per persid/year
preserve
	encode sekt, generate(sector)
	gcollapse (max) sector, by(year persid)

	compress
	save "$datapath/workcat_byyear.dta", replace
restore
}
*
{ // Generate dataset unique at persid and year (highest paying employer)
*===============================================================================
// Fixing duplicates and saving relevant variables
* duplicates tag stores the number of OTHER observations that share the same
* persid/year: 0 = one job, 1 = two jobs, and so on. It is computed before
* rows with missing manl are dropped, so those rows are counted as well.
duplicates tag persid year, gen(multiplejobs_WS)
sum multiplejobs_WS 	// About 4.3% have multiple jobs within a year

* Keep, for every persid/year, the values from the row with the highest
* monthly wage: after sorting on manl that row is the last one in each group.
* The stable option keeps tied rows in their existing order, so the job picked
* among equal wages no longer varies between runs.
drop if manl==.
sort persid year manl, stable
fcollapse (last) manl_notfirm_specific=manl  multiplejobs_WS firmid tjomf_notfirm_specific=tjomf arbtim_notfirm_specific=arbtim,by(persid year)
	

rename  firmid 			firmid_lonestruktur
	
lab var manl_notfirm_specific	"Highest monthly wage from wage survey in a year"
lab var tjomf_notfirm_specific	"Share of fulltime for highest paid job in wage survey"	
lab var arbtim_notfirm_specific "Hours worked for highest paid job in wage survey"	
* The label states what duplicates tag actually measures (surplus jobs)
lab var multiplejobs_WS 	"Number of additional jobs in wage survey in a year (0 = one job)"


save "$datapath/A1_clean_wagesurvey_00_19_persidyear.dta", replace
}
*

log close



/*
*rortup is always missing for sector 4, whereas rorllon is never missing for sector 4

*Calculate monthly wage for those with missing values

*Note:  fastlon already seems to account for tjomf. For ovklon one has to 
*	divide by 0.tjomf, i.e.
*	manl = (fastlon) + rortup +skift + jourberers 
*	manl = (ovklon/0.tjomf) + rortup + skift + jourberers
*Note:  fastlon for sektor 1 and 2 in year 2000-2007
*	ovklon for sektor 3 and 4 in year 2000-2007
*	grlon for sektor 1 and 2 in year 2008-2018
*	ovklon 

*fastlon=grlon (for sector 1 and 2)
 tab year sector if fastlon!=.
 tab year sector if grlon!=.
 replace grlon = fastlon if year<=2007
 drop fastlon
 
 *Same variable but for different sectors
*Rörliga lönetillägg (rortup)
*Rörliga lönetillägg = (ob + jour + bered + civvar + fardtid + fskarbtid +
*offupp). För deltidsanställda räknas lönetilläggen upp till motsvarande vid
*heltid.
tab year sector if rortup!=. 
tab year sector if rorllon!=. 
 
manl = fastlon + rortup if sector==1 & year<2008
manl = (grlon) + skift + jourberers = (ovklon + rorlon) + skift + jourberers [if sector==4]

glon =approx=fastlon=ovklon/.tjomf +rorllon (glon comes from older leverans of wage data)

2000-2007
fastlon = sektor 1 och 2 
ovklon = sektor 3 och 4 

2008-2014
grlon =  sektor 1 och 2 (fastlon missing)
ovklon = sektor 3 och 4 

2014-2018
grlon =  sektor 1 och 2 (fastlon missing)
ovklon = sektor 3 och 4 
manl = glon + skift + jourtillag = (ovklon + rorlon) + skift + jourtillag
manl = glon + skift + jourtillag = (ovklon + rorlon) + skift + jourtillag (galler for sektor 4?)

*There seem to be some issues here: this does not appear to hold for workers who are not full-time
*ie glon = ovklon + rorlon

*from 2014 and onwards we do have manl

*manl = glon + skift + jourtillag = ovklon + rorlon + skift + jourtillag

*glon = ovklon + rorlon
*/
